#ifdef __sw_slave__
#include <simd.h>
#define AUX_FLIP_DEF                                                           \
  int256 flip0 =                                                               \
      simd_set_int256(0x3ffL << 52, 0x3ffL << 52, 0x3ffL << 52, 0x3ffL << 52); \
  int256 flip1 =                                                               \
      simd_set_int256(0xbffL << 52, 0x3ffL << 52, 0x3ffL << 52, 0x3ffL << 52); \
  int256 flip2 =                                                               \
      simd_set_int256(0x3ffL << 52, 0xbffL << 52, 0x3ffL << 52, 0x3ffL << 52); \
  int256 flip3 =                                                               \
      simd_set_int256(0xbffL << 52, 0xbffL << 52, 0x3ffL << 52, 0x3ffL << 52); \
  int256 flip4 =                                                               \
      simd_set_int256(0x3ffL << 52, 0x3ffL << 52, 0xbffL << 52, 0x3ffL << 52); \
  int256 flip5 =                                                               \
      simd_set_int256(0xbffL << 52, 0x3ffL << 52, 0xbffL << 52, 0x3ffL << 52); \
  int256 flip6 =                                                               \
      simd_set_int256(0x3ffL << 52, 0xbffL << 52, 0xbffL << 52, 0x3ffL << 52); \
  int256 flip7 =                                                               \
      simd_set_int256(0xbffL << 52, 0xbffL << 52, 0xbffL << 52, 0x3ffL << 52); \
  int256 aux = simd_set_int256(1L << 63, 1L << 63, 1L << 63, 1L << 63);        \
  asm("wcsr %0, 0x80\n\t" ::"r"(0x6c));

#define BITONIC_INC_4V(v0, v1, v2, v3)                                         \
  {                                                                            \
    int256 t0, t1;                                                             \
    asm("vsubl   %[V0], %[V2], %[VCMP]\n\t"                                    \
        "vcpys   %[VCMP], %[VFLIP3], %[VCMP]\n\t"                              \
        "vsellt  %[VCMP], %[V2], %[V0], %[VSWAP]\n\t"                          \
        "vsellt  %[VCMP], %[V0], %[V2], %[V0]\n\t"                             \
        "vcpys   %[VSWAP], %[VSWAP], %[V2]\n\t"                                \
        "vsubl   %[V1], %[V3], %[VCMP]\n\t"                                    \
        "vcpys   %[VCMP], %[VFLIP3], %[VCMP]\n\t"                              \
        "vsellt  %[VCMP], %[V3], %[V1], %[VSWAP]\n\t"                          \
        "vsellt  %[VCMP], %[V1], %[V3], %[V1]\n\t"                             \
        "vcpys   %[VSWAP], %[VSWAP], %[V3]\n\t"                                \
        : [ V0 ] "=r"(v0), [ V1 ] "=r"(v1), [ V2 ] "=r"(v2), [ V3 ] "=r"(v3),  \
          [ VSWAP ] "=&r"(t0), [ VCMP ] "=&r"(t1)                              \
        : [ AUX ] "r"(aux), [ VFLIP3 ] "r"(flip3), [ R0 ] "0"(v0),             \
          [ R1 ] "1"(v1), [ R2 ] "2"(v2), [ R3 ] "3"(v3));                     \
  }

#define BITONIC_DEC_4V(v0, v1, v2, v3)                                         \
  {                                                                            \
    int256 t0, t1;                                                             \
    asm("vsubl   %[V0], %[V2], %[VCMP]\n\t"                                    \
        "vcpys   %[VCMP], %[VFLIP3], %[VCMP]\n\t"                              \
        "vsellt  %[VCMP], %[V0], %[V2], %[VSWAP]\n\t"                          \
        "vsellt  %[VCMP], %[V2], %[V0], %[V0]\n\t"                             \
        "vcpys   %[VSWAP], %[VSWAP], %[V2]\n\t"                                \
        "vsubl   %[V1], %[V3], %[VCMP]\n\t"                                    \
        "vcpys   %[VCMP], %[VFLIP3], %[VCMP]\n\t"                              \
        "vsellt  %[VCMP], %[V1], %[V3], %[VSWAP]\n\t"                          \
        "vsellt  %[VCMP], %[V3], %[V1], %[V1]\n\t"                             \
        "vcpys   %[VSWAP], %[VSWAP], %[V3]\n\t"                                \
        : [ V0 ] "=r"(v0), [ V1 ] "=r"(v1), [ V2 ] "=r"(v2), [ V3 ] "=r"(v3),  \
          [ VSWAP ] "=&r"(t0), [ VCMP ] "=&r"(t1)                              \
        : [ AUX ] "r"(aux), [ VFLIP3 ] "r"(flip3), [ R0 ] "0"(v0),             \
          [ R1 ] "1"(v1), [ R2 ] "2"(v2), [ R3 ] "3"(v3));                     \
  }
// void bitonic_sort(long *arr, int n) {
//   if (n <= 32) {
//     for (int i = n; i < 32; i ++) {
//       arr[i] = 0x7FFFFFFFFFFFFFFFL;
//     }
//     bitonic_sort_32(arr);
//     return;
//   }
//   if (n <= 64) {
//     for (int i = n; i < 64; i ++) {
//       arr[i] = 0x7FFFFFFFFFFFFFFFL;
//     }
//     bitonic_sort_64(arr);
//     return;
//   }
//   AUX_FLIP_DEF;
//   // long ed, st;
//   // asm volatile("rcsr %0, 4\n\t" : "=r"(st));
  
//   int npad = 128;
//   while (npad < n)
//     npad <<= 1;
//   for (int i = n; i < npad; i ++) {
//     arr[i] = 0x7FFFFFFFFFFFFFFFL;
//   }
//   for (int i = 0; i < npad; i += 128) {
//     bitonic_sort_64_inc(arr + i + 0);
//     bitonic_sort_64_dec(arr + i + 64);
//   }
//   for (int r0s = 7; (1 << r0s) <= npad; r0s++) {
//     int r0 = 1 << r0s;

//     for (int iout = 0; iout < npad; iout += r0) {
//       // printf("iout: %d\n", iout);
//       if (iout >> r0s & 1) {
//         for (int r1 = r0 >> 1; r1 >= 64; r1 >>= 1) {
//           for (int iin = iout; iin < iout + r0; iin += r1 + r1) {
//             // printf("iin, dec: %d\n", iin);
//             for (int i = iin; i < iin + r1; i += 8) {
//               int256 v0, v1, v2, v3;
//               simd_load(v0, arr + i + 0);
//               simd_load(v1, arr + i + 4);
//               simd_load(v2, arr + i + r1 + 0);
//               simd_load(v3, arr + i + r1 + 4);
//               BITONIC_DEC_4V(v0, v1, v2, v3);
//               simd_store(v0, arr + i + 0);
//               simd_store(v1, arr + i + 4);
//               simd_store(v2, arr + i + r1 + 0);
//               simd_store(v3, arr + i + r1 + 4);
//             }
//           }
//         }
//         for (int i = iout; i < iout + r0; i += 64)
//           bitonic_64_dec(arr + i);
//       } else {
//         for (int r1 = r0 >> 1; r1 >= 64; r1 >>= 1) {
//           for (int iin = iout; iin < iout + r0; iin += r1 + r1) {
//             // printf("iin, inc: %d\n", iin);
//             for (int i = iin; i < iin + r1; i += 8) {
//               int256 v0, v1, v2, v3;
//               simd_load(v0, arr + i + 0);
//               simd_load(v1, arr + i + 4);
//               simd_load(v2, arr + i + r1 + 0);
//               simd_load(v3, arr + i + r1 + 4);
//               BITONIC_INC_4V(v0, v1, v2, v3);
//               simd_store(v0, arr + i + 0);
//               simd_store(v1, arr + i + 4);
//               simd_store(v2, arr + i + r1 + 0);
//               simd_store(v3, arr + i + r1 + 4);
//             }
//           }
//         }
//         for (int i = iout; i < iout + r0; i += 64) {
//           bitonic_64_inc(arr + i);
//         }
//       }
//     }
//   }
// }
#include <stdio.h>
#include "esmd_types.h"
// #define max(a, b) ((a) > (b) ? (a):(b))
// #define min(a, b) ((a) < (b) ? (a):(b))
void bitonic_sort(long *tag, int n) {
  int npad = 1;
  while (npad < n) npad <<= 1;
  for (int i = n; i < npad; i ++) {
    tag[i] = 0x7fffffffffffffffL;
  }
  // printf("%d %d\n", n, npad);
  if (npad > 1024) {
    puts("sorting oor");
  }
  for (int r0s = 1; (1 << r0s) <= npad; r0s ++) {
    int r0 = 1 << r0s;
    for (int r1 = r0 >> 1; r1 > 0; r1 >>= 1) {
      for (int iout = 0; iout < npad; iout += r0) {
        // printf("%d %d\n", r0, iout);
        if (iout >> r0s & 1){
          for (int iin = iout; iin < iout + r0; iin += r1 + r1) {
            for (int i = iin; i < iin + r1; i ++) {
              long nmax = max(tag[i], tag[i + r1]);
              long nmin = min(tag[i], tag[i + r1]);
              tag[i] = nmax;
              tag[i + r1] = nmin;
              if (i + r1 >= 1024 || i < 0 || i + r1 < 0 || i >= 1024) puts("sorting oor");
            }
          }
        } else {
          for (int iin = iout; iin < iout + r0; iin += r1 + r1) {
            for (int i = iin; i < iin + r1; i ++) {
              long nmax = max(tag[i], tag[i + r1]);
              long nmin = min(tag[i], tag[i + r1]);
              tag[i] = nmin;
              tag[i + r1] = nmax;
              if (i + r1 >= 1024 || i < 0 || i + r1 < 0 || i >= 1024) puts("sorting oor");
            }
          }
        }
      }
    }
  }
  for (int i = 1; i < n; i ++) {
    if (tag[i] < tag[i - 1]) {
      puts("sort failed!");
    }
  }
  // puts("sort done\n");
}
#endif
#ifdef __sw_host__

#include <assert.h>
#include <stdio.h>
#include "esmd_types.h"
#define max(a, b) ((a) > (b) ? (a):(b))
#define min(a, b) ((a) < (b) ? (a):(b))
void bitonic_sort(long *tag, int n) {
  int npad = 1;
  while (npad < n) npad <<= 1;
  // for (int i = n; i < npad; i ++) {
  //   tag[i] = 0x7fffffffffffffffL;
  // }
  assert(npad <= 1024);
  for (int r0s = 1; (1 << r0s) <= npad; r0s ++) {
    int r0 = 1 << r0s;
    for (int r1 = r0 >> 1; r1 > 0; r1 >>= 1) {
      for (int iout = 0; iout < npad; iout += r0) {
        // printf("%d %d\n", r0, iout);
        if (iout >> r0s & 1){
          for (int iin = iout; iin < iout + r0; iin += r1 + r1) {
            for (int i = iin; i < iin + r1; i ++) {
              long nmax = max(tag[i], tag[i + r1]);
              long nmin = min(tag[i], tag[i + r1]);
              tag[i] = nmax;
              tag[i + r1] = nmin;
              if (i + r1 >= 1024 || i < 0 || i + r1 < 0) puts("sorting oor");
            }
          }
        } else {
          for (int iin = iout; iin < iout + r0; iin += r1 + r1) {
            for (int i = iin; i < iin + r1; i ++) {
              long nmax = max(tag[i], tag[i + r1]);
              long nmin = min(tag[i], tag[i + r1]);
              tag[i] = nmin;
              tag[i + r1] = nmax;
              if (i + r1 >= 1024 || i < 0 || i + r1 < 0) puts("sorting oor");
            }
          }
        }
      }
    }
  }
  for (int i = 1; i < n; i ++) {
    if (tag[i] < tag[i - 1]) {
      puts("sort failed!");
    }
  }
  // puts("sort done\n");
}
#endif